{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# MiniCPM-2B 参数高效微调（LoRA）A100 80G 单卡示例\n",
    "\n",
    "显存更小的显卡可以减小 batch size 并相应增大 grad_accum，以时间换空间。\n",
    "\n",
    "本 notebook 是一个使用 `OCNLI` 数据集对 MiniCPM-2B 进行 LoRA 微调，使其具备判断两个句子之间推理关系（entailment / neutral / contradiction）能力的代码示例。\n",
    "\n",
    "## 最低硬件需求\n",
    "- 显存：12GB\n",
    "- 显卡架构：安培架构（推荐）\n",
    "- 内存：16GB"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. 准备数据集\n",
    "\n",
    "将数据转换为更通用的格式"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the OCNLI JSON-lines files into ChatML-style message lists.\n",
    "import os\n",
    "import shutil\n",
    "import json\n",
    "\n",
    "input_dir = \"data/ocnli_public\"\n",
    "output_dir = \"data/ocnli_public_chatml\"\n",
    "# Recreate the output directory from scratch so files from earlier runs cannot linger.\n",
    "if os.path.exists(output_dir):\n",
    "    shutil.rmtree(output_dir)\n",
    "os.makedirs(output_dir, exist_ok=True)\n",
    "\n",
    "for fn in [\"train.json\", \"dev.json\"]:\n",
    "    data_out_list = []\n",
    "    # The sentences are Chinese text: pin UTF-8 instead of relying on the platform default encoding.\n",
    "    with open(os.path.join(input_dir, fn), \"r\", encoding=\"utf-8\") as f:\n",
    "        for line in f:\n",
    "            # Skip blank lines; every remaining line is parsed as one JSON object.\n",
    "            if len(line.strip()) > 0:\n",
    "                data = json.loads(line)\n",
    "                # NOTE(review): labels are copied verbatim; if the dataset contains labels outside\n",
    "                # the three classes named in the prompt, they are kept as-is - confirm whether to filter.\n",
    "                data_out = {\n",
    "                    \"messages\": [\n",
    "                        {\n",
    "                            \"role\": \"user\",\n",
    "                            \"content\": f\"请判断下边两个句子的关系属于 [entailment, neutral, contradiction]中的哪一种？\\n句子1: {data['sentence1']}\\n句子2：{data['sentence2']}\\n\"\n",
    "                        },\n",
    "                        {\n",
    "                            \"role\": \"assistant\",\n",
    "                            \"content\": data[\"label\"],\n",
    "                        },\n",
    "                    ]\n",
    "                }\n",
    "                data_out_list.append(data_out)\n",
    "    # Open the output only after the input was parsed completely, so a failure never leaves a\n",
    "    # truncated file behind; ensure_ascii=False keeps the Chinese text human-readable.\n",
    "    with open(os.path.join(output_dir, fn), \"w\", encoding=\"utf-8\") as fo:\n",
    "        json.dump(data_out_list, fo, ensure_ascii=False, indent=4)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. 使用 LoRA 进行微调\n",
    "\n",
    "命令行一键运行"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "20240315212836\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-03-15 21:28:38,758] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n",
      "[2024-03-15 21:28:45,799] [WARNING] [runner.py:202:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only.\n",
      "[2024-03-15 21:28:45,799] [INFO] [runner.py:568:main] cmd = /usr/bin/python3 -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMF19 --master_addr=127.0.0.1 --master_port=19888 --enable_each_rank_log=None finetune.py --model_name_or_path MiniCPM-2B-sft-bf16 --output_dir output/ocnli_public_chatml/20240315212836/ --train_data_path data/ocnli_public_chatml/train.json --eval_data_path data/ocnli_public_chatml/dev.json --learning_rate 5e-5 --per_device_train_batch_size 64 --per_device_eval_batch_size 128 --model_max_length 128 --bf16 --use_lora --gradient_accumulation_steps 1 --warmup_steps 100 --max_steps 1000 --weight_decay 0.01 --evaluation_strategy steps --eval_steps 500 --save_strategy steps --save_steps 500 --seed 42 --log_level info --logging_strategy steps --logging_steps 10 --deepspeed configs/ds_config_zero3_offload.json\n",
      "[2024-03-15 21:28:47,849] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n",
      "[2024-03-15 21:28:54,904] [INFO] [launch.py:145:main] WORLD INFO DICT: {'localhost': [0]}\n",
      "[2024-03-15 21:28:54,905] [INFO] [launch.py:151:main] nnodes=1, num_local_procs=1, node_rank=0\n",
      "[2024-03-15 21:28:54,905] [INFO] [launch.py:162:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0]})\n",
      "[2024-03-15 21:28:54,905] [INFO] [launch.py:163:main] dist_world_size=1\n",
      "[2024-03-15 21:28:54,905] [INFO] [launch.py:165:main] Setting CUDA_VISIBLE_DEVICES=0\n",
      "[2024-03-15 21:28:54,905] [INFO] [launch.py:253:main] process 86577 spawned with command: ['/usr/bin/python3', '-u', 'finetune.py', '--local_rank=0', '--model_name_or_path', 'MiniCPM-2B-sft-bf16', '--output_dir', 'output/ocnli_public_chatml/20240315212836/', '--train_data_path', 'data/ocnli_public_chatml/train.json', '--eval_data_path', 'data/ocnli_public_chatml/dev.json', '--learning_rate', '5e-5', '--per_device_train_batch_size', '64', '--per_device_eval_batch_size', '128', '--model_max_length', '128', '--bf16', '--use_lora', '--gradient_accumulation_steps', '1', '--warmup_steps', '100', '--max_steps', '1000', '--weight_decay', '0.01', '--evaluation_strategy', 'steps', '--eval_steps', '500', '--save_strategy', 'steps', '--save_steps', '500', '--seed', '42', '--log_level', 'info', '--logging_strategy', 'steps', '--logging_steps', '10', '--deepspeed', 'configs/ds_config_zero3_offload.json']\n",
      "[2024-03-15 21:29:03,964] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n",
      "[2024-03-15 21:29:04,250] [INFO] [comm.py:637:init_distributed] cdb=None\n",
      "[2024-03-15 21:29:04,250] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl\n",
      "/usr/local/lib/python3.10/dist-packages/torch/_utils.py:836: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly.  To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n",
      "  return self.fget.__get__(instance, owner)()\n",
      "Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in MiniCPMForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n",
      "You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.\n",
      "Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in MiniCPMModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n",
      "[2024-03-15 21:29:08,998] [INFO] [partition_parameters.py:343:__exit__] finished initializing model - num_params = 363, num_elems = 3.01B\n",
      "trainable params: 2,949,120 || all params: 2,727,830,016 || trainable%: 0.10811230841738784\n",
      "input: <s> <用户> 请判断下边两个句子的关系属于 [entailment, neutral, contradiction]中的哪一种？\n",
      "句子1: 一月份跟二月份肯定有一个月份有.\n",
      "句子2：肯定有一个月份有\n",
      " <AI> entailment</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s>\n",
      "label: entailment\n",
      "input: <s> <用户> 请判断下边两个句子的关系属于 [entailment, neutral, contradiction]中的哪一种？\n",
      "句子1: 身上裹一件工厂发的棉大衣,手插在袖筒里\n",
      "句子2：身上至少一件衣服\n",
      " <AI> entailment</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s>\n",
      "label: entailment\n",
      "Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.\n",
      "max_steps is given, it will override any value given in num_train_epochs\n",
      "Using auto half precision backend\n",
      "Detected ZeRO Offload and non-DeepSpeed optimizers: This combination should work as long as the custom optimizer has both CPU and GPU implementation (except LAMB)\n",
      "Using /home/jeeves/.cache/torch_extensions/py310_cu123 as PyTorch extensions root...\n",
      "Detected CUDA files, patching ldflags\n",
      "Emitting ninja build file /home/jeeves/.cache/torch_extensions/py310_cu123/cpu_adam/build.ninja...\n",
      "Building extension module cpu_adam...\n",
      "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n",
      "ninja: no work to do.\n",
      "Loading extension module cpu_adam...\n",
      "Time to load cpu_adam op: 2.3341457843780518 seconds\n",
      "Adam Optimizer #0 is created with AVX512 arithmetic capability.\n",
      "Config: alpha=0.000050, betas=(0.900000, 0.999000), weight_decay=0.010000, adam_w=1\n",
      "[2024-03-15 21:29:15,864] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.14.0, git-hash=unknown, git-branch=unknown\n",
      "[2024-03-15 21:29:15,884] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False\n",
      "[2024-03-15 21:29:15,886] [INFO] [logging.py:96:log_dist] [Rank 0] Using client Optimizer as basic optimizer\n",
      "[2024-03-15 21:29:15,886] [INFO] [logging.py:96:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer\n",
      "[2024-03-15 21:29:15,895] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Basic Optimizer = DeepSpeedCPUAdam\n",
      "[2024-03-15 21:29:15,896] [INFO] [utils.py:56:is_zero_supported_optimizer] Checking ZeRO support for optimizer=DeepSpeedCPUAdam type=<class 'deepspeed.ops.adam.cpu_adam.DeepSpeedCPUAdam'>\n",
      "[2024-03-15 21:29:15,896] [INFO] [logging.py:96:log_dist] [Rank 0] Creating fp16 ZeRO stage 3 optimizer, MiCS is enabled False, Hierarchical params gather False\n",
      "[2024-03-15 21:29:15,896] [INFO] [logging.py:96:log_dist] [Rank 0] Creating torch.bfloat16 ZeRO stage 3 optimizer\n",
      "[2024-03-15 21:29:16,049] [INFO] [utils.py:800:see_memory_usage] Stage 3 initialize beginning\n",
      "[2024-03-15 21:29:16,049] [INFO] [utils.py:801:see_memory_usage] MA 0.03 GB         Max_MA 1.62 GB         CA 0.04 GB         Max_CA 2 GB \n",
      "[2024-03-15 21:29:16,049] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory:  used = 138.03 GB, percent = 13.7%\n",
      "[2024-03-15 21:29:16,053] [INFO] [stage3.py:130:__init__] Reduce bucket size 5308416\n",
      "[2024-03-15 21:29:16,053] [INFO] [stage3.py:131:__init__] Prefetch bucket size 4777574\n",
      "[2024-03-15 21:29:16,201] [INFO] [utils.py:800:see_memory_usage] DeepSpeedZeRoOffload initialize [begin]\n",
      "[2024-03-15 21:29:16,201] [INFO] [utils.py:801:see_memory_usage] MA 0.03 GB         Max_MA 0.03 GB         CA 0.04 GB         Max_CA 0 GB \n",
      "[2024-03-15 21:29:16,201] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory:  used = 138.03 GB, percent = 13.7%\n",
      "Parameter Offload: Total persistent parameters: 3135744 in 241 params\n",
      "[2024-03-15 21:29:16,449] [INFO] [utils.py:800:see_memory_usage] DeepSpeedZeRoOffload initialize [end]\n",
      "[2024-03-15 21:29:16,450] [INFO] [utils.py:801:see_memory_usage] MA 0.02 GB         Max_MA 0.03 GB         CA 0.04 GB         Max_CA 0 GB \n",
      "[2024-03-15 21:29:16,450] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory:  used = 138.04 GB, percent = 13.7%\n",
      "[2024-03-15 21:29:16,608] [INFO] [utils.py:800:see_memory_usage] Before creating fp16 partitions\n",
      "[2024-03-15 21:29:16,609] [INFO] [utils.py:801:see_memory_usage] MA 0.02 GB         Max_MA 0.02 GB         CA 0.04 GB         Max_CA 0 GB \n",
      "[2024-03-15 21:29:16,609] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory:  used = 138.05 GB, percent = 13.7%\n",
      "[2024-03-15 21:29:16,776] [INFO] [utils.py:800:see_memory_usage] After creating fp16 partitions: 1\n",
      "[2024-03-15 21:29:16,777] [INFO] [utils.py:801:see_memory_usage] MA 0.02 GB         Max_MA 0.02 GB         CA 0.04 GB         Max_CA 0 GB \n",
      "[2024-03-15 21:29:16,777] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory:  used = 138.05 GB, percent = 13.7%\n",
      "[2024-03-15 21:29:16,931] [INFO] [utils.py:800:see_memory_usage] Before creating fp32 partitions\n",
      "[2024-03-15 21:29:16,932] [INFO] [utils.py:801:see_memory_usage] MA 0.02 GB         Max_MA 0.02 GB         CA 0.04 GB         Max_CA 0 GB \n",
      "[2024-03-15 21:29:16,932] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory:  used = 138.05 GB, percent = 13.7%\n",
      "[2024-03-15 21:29:17,099] [INFO] [utils.py:800:see_memory_usage] After creating fp32 partitions\n",
      "[2024-03-15 21:29:17,100] [INFO] [utils.py:801:see_memory_usage] MA 0.02 GB         Max_MA 0.02 GB         CA 0.04 GB         Max_CA 0 GB \n",
      "[2024-03-15 21:29:17,100] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory:  used = 138.04 GB, percent = 13.7%\n",
      "[2024-03-15 21:29:17,254] [INFO] [utils.py:800:see_memory_usage] Before initializing optimizer states\n",
      "[2024-03-15 21:29:17,254] [INFO] [utils.py:801:see_memory_usage] MA 0.02 GB         Max_MA 0.02 GB         CA 0.04 GB         Max_CA 0 GB \n",
      "[2024-03-15 21:29:17,254] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory:  used = 138.04 GB, percent = 13.7%\n",
      "[2024-03-15 21:29:17,425] [INFO] [utils.py:800:see_memory_usage] After initializing optimizer states\n",
      "[2024-03-15 21:29:17,425] [INFO] [utils.py:801:see_memory_usage] MA 0.02 GB         Max_MA 0.02 GB         CA 0.04 GB         Max_CA 0 GB \n",
      "[2024-03-15 21:29:17,425] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory:  used = 138.04 GB, percent = 13.7%\n",
      "[2024-03-15 21:29:17,426] [INFO] [stage3.py:486:_setup_for_real_optimizer] optimizer state initialized\n",
      "[2024-03-15 21:29:17,633] [INFO] [utils.py:800:see_memory_usage] After initializing ZeRO optimizer\n",
      "[2024-03-15 21:29:17,633] [INFO] [utils.py:801:see_memory_usage] MA 0.03 GB         Max_MA 0.03 GB         CA 0.06 GB         Max_CA 0 GB \n",
      "[2024-03-15 21:29:17,634] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory:  used = 138.05 GB, percent = 13.7%\n",
      "[2024-03-15 21:29:17,634] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Final Optimizer = DeepSpeedCPUAdam\n",
      "[2024-03-15 21:29:17,634] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed using client LR scheduler\n",
      "[2024-03-15 21:29:17,634] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed LR Scheduler = None\n",
      "[2024-03-15 21:29:17,634] [INFO] [logging.py:96:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0], mom=[(0.9, 0.999)]\n",
      "[2024-03-15 21:29:17,636] [INFO] [config.py:996:print] DeepSpeedEngine configuration:\n",
      "[2024-03-15 21:29:17,636] [INFO] [config.py:1000:print]   activation_checkpointing_config  {\n",
      "    \"partition_activations\": false, \n",
      "    \"contiguous_memory_optimization\": false, \n",
      "    \"cpu_checkpointing\": false, \n",
      "    \"number_checkpoints\": null, \n",
      "    \"synchronize_checkpoint_boundary\": false, \n",
      "    \"profile\": false\n",
      "}\n",
      "[2024-03-15 21:29:17,636] [INFO] [config.py:1000:print]   aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}\n",
      "[2024-03-15 21:29:17,636] [INFO] [config.py:1000:print]   amp_enabled .................. False\n",
      "[2024-03-15 21:29:17,636] [INFO] [config.py:1000:print]   amp_params ................... False\n",
      "[2024-03-15 21:29:17,636] [INFO] [config.py:1000:print]   autotuning_config ............ {\n",
      "    \"enabled\": false, \n",
      "    \"start_step\": null, \n",
      "    \"end_step\": null, \n",
      "    \"metric_path\": null, \n",
      "    \"arg_mappings\": null, \n",
      "    \"metric\": \"throughput\", \n",
      "    \"model_info\": null, \n",
      "    \"results_dir\": \"autotuning_results\", \n",
      "    \"exps_dir\": \"autotuning_exps\", \n",
      "    \"overwrite\": true, \n",
      "    \"fast\": true, \n",
      "    \"start_profile_step\": 3, \n",
      "    \"end_profile_step\": 5, \n",
      "    \"tuner_type\": \"gridsearch\", \n",
      "    \"tuner_early_stopping\": 5, \n",
      "    \"tuner_num_trials\": 50, \n",
      "    \"model_info_path\": null, \n",
      "    \"mp_size\": 1, \n",
      "    \"max_train_batch_size\": null, \n",
      "    \"min_train_batch_size\": 1, \n",
      "    \"max_train_micro_batch_size_per_gpu\": 1.024000e+03, \n",
      "    \"min_train_micro_batch_size_per_gpu\": 1, \n",
      "    \"num_tuning_micro_batch_sizes\": 3\n",
      "}\n",
      "[2024-03-15 21:29:17,636] [INFO] [config.py:1000:print]   bfloat16_enabled ............. True\n",
      "[2024-03-15 21:29:17,636] [INFO] [config.py:1000:print]   bfloat16_immediate_grad_update  False\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   checkpoint_parallel_write_pipeline  False\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   checkpoint_tag_validation_enabled  True\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   checkpoint_tag_validation_fail  False\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   comms_config ................. <deepspeed.comm.config.DeepSpeedCommsConfig object at 0x7f095baedab0>\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   communication_data_type ...... None\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   compile_config ............... enabled=False backend='inductor' kwargs={}\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}}\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   curriculum_enabled_legacy .... False\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   curriculum_params_legacy ..... False\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}}\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   data_efficiency_enabled ...... False\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   dataloader_drop_last ......... False\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   disable_allgather ............ False\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   dump_state ................... False\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   dynamic_loss_scale_args ...... None\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   eigenvalue_enabled ........... False\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   eigenvalue_gas_boundary_resolution  1\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   eigenvalue_layer_name ........ bert.encoder.layer\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   eigenvalue_layer_num ......... 0\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   eigenvalue_max_iter .......... 100\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   eigenvalue_stability ......... 1e-06\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   eigenvalue_tol ............... 0.01\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   eigenvalue_verbose ........... False\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   elasticity_enabled ........... False\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   flops_profiler_config ........ {\n",
      "    \"enabled\": false, \n",
      "    \"recompute_fwd_factor\": 0.0, \n",
      "    \"profile_step\": 1, \n",
      "    \"module_depth\": -1, \n",
      "    \"top_modules\": 1, \n",
      "    \"detailed\": true, \n",
      "    \"output_file\": null\n",
      "}\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   fp16_auto_cast ............... None\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   fp16_enabled ................. False\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   fp16_master_weights_and_gradients  False\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   global_rank .................. 0\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   grad_accum_dtype ............. None\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   gradient_accumulation_steps .. 1\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   gradient_clipping ............ 1.0\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   gradient_predivide_factor .... 1.0\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   graph_harvesting ............. False\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   initial_dynamic_scale ........ 1\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   load_universal_checkpoint .... False\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   loss_scale ................... 1.0\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   memory_breakdown ............. False\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   mics_hierarchial_params_gather  False\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   mics_shard_size .............. -1\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   nebula_config ................ {\n",
      "    \"enabled\": false, \n",
      "    \"persistent_storage_path\": null, \n",
      "    \"persistent_time_interval\": 100, \n",
      "    \"num_of_version_in_retention\": 2, \n",
      "    \"enable_nebula_load\": true, \n",
      "    \"load_path\": null\n",
      "}\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   optimizer_legacy_fusion ...... False\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   optimizer_name ............... None\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   optimizer_params ............. None\n",
      "[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print]   pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True}\n",
      "[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print]   pld_enabled .................. False\n",
      "[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print]   pld_params ................... False\n",
      "[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print]   prescale_gradients ........... False\n",
      "[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print]   scheduler_name ............... None\n",
      "[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print]   scheduler_params ............. None\n",
      "[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print]   seq_parallel_communication_data_type  torch.float32\n",
      "[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print]   sparse_attention ............. None\n",
      "[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print]   sparse_gradients_enabled ..... False\n",
      "[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print]   steps_per_print .............. inf\n",
      "[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print]   train_batch_size ............. 64\n",
      "[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print]   train_micro_batch_size_per_gpu  64\n",
      "[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print]   use_data_before_expert_parallel_  False\n",
      "[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print]   use_node_local_storage ....... False\n",
      "[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print]   wall_clock_breakdown ......... False\n",
      "[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print]   weight_quantization_config ... None\n",
      "[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print]   world_size ................... 1\n",
      "[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print]   zero_allow_untested_optimizer  True\n",
      "[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print]   zero_config .................. stage=3 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=5308416 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=DeepSpeedZeroOffloadParamConfig(device='cpu', nvme_path=None, buffer_count=5, buffer_size=100,000,000, max_in_cpu=1,000,000,000, pin_memory=True) offload_optimizer=DeepSpeedZeroOffloadOptimizerConfig(device='cpu', nvme_path=None, buffer_count=4, pin_memory=True, pipeline=False, pipeline_read=False, pipeline_write=False, fast_init=False, ratio=1.0) sub_group_size=1,000,000,000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=4777574 param_persistence_threshold=23040 model_persistence_threshold=sys.maxsize max_live_parameters=1,000,000,000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=True stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True\n",
      "[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print]   zero_enabled ................. True\n",
      "[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print]   zero_force_ds_cpu_optimizer .. True\n",
      "[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print]   zero_optimization_stage ...... 3\n",
      "[2024-03-15 21:29:17,638] [INFO] [config.py:986:print_user_config]   json = {\n",
      "    \"fp16\": {\n",
      "        \"enabled\": false, \n",
      "        \"loss_scale\": 0, \n",
      "        \"loss_scale_window\": 1000, \n",
      "        \"initial_scale_power\": 16, \n",
      "        \"hysteresis\": 2, \n",
      "        \"min_loss_scale\": 1\n",
      "    }, \n",
      "    \"bf16\": {\n",
      "        \"enabled\": true\n",
      "    }, \n",
      "    \"zero_optimization\": {\n",
      "        \"stage\": 3, \n",
      "        \"allgather_partitions\": true, \n",
      "        \"allgather_bucket_size\": 5.000000e+08, \n",
      "        \"reduce_scatter\": true, \n",
      "        \"contiguous_gradients\": true, \n",
      "        \"overlap_comm\": true, \n",
      "        \"reduce_bucket_size\": 5.308416e+06, \n",
      "        \"stage3_prefetch_bucket_size\": 4.777574e+06, \n",
      "        \"stage3_param_persistence_threshold\": 2.304000e+04, \n",
      "        \"stage3_gather_16bit_weights_on_model_save\": true, \n",
      "        \"offload_optimizer\": {\n",
      "            \"device\": \"cpu\", \n",
      "            \"pin_memory\": true\n",
      "        }, \n",
      "        \"offload_param\": {\n",
      "            \"device\": \"cpu\", \n",
      "            \"pin_memory\": true\n",
      "        }\n",
      "    }, \n",
      "    \"train_batch_size\": 64, \n",
      "    \"train_micro_batch_size_per_gpu\": 64, \n",
      "    \"gradient_accumulation_steps\": 1, \n",
      "    \"gradient_clipping\": 1.0, \n",
      "    \"wall_clock_breakdown\": false, \n",
      "    \"flops_profiler\": {\n",
      "        \"enabled\": false, \n",
      "        \"profile_step\": 1, \n",
      "        \"module_depth\": -1, \n",
      "        \"top_modules\": 1, \n",
      "        \"detailed\": true, \n",
      "        \"output_file\": null\n",
      "    }, \n",
      "    \"steps_per_print\": inf, \n",
      "    \"zero_allow_untested_optimizer\": true\n",
      "}\n",
      "***** Running training *****\n",
      "  Num examples = 50,486\n",
      "  Num Epochs = 2\n",
      "  Instantaneous batch size per device = 64\n",
      "  Total train batch size (w. parallel, distributed & accumulation) = 64\n",
      "  Gradient Accumulation steps = 1\n",
      "  Total optimization steps = 1,000\n",
      "  Number of trainable parameters = 2,949,120\n",
      "{'loss': 2.2004, 'grad_norm': 44.037304409869364, 'learning_rate': 5e-06, 'epoch': 0.01}\n",
      "{'loss': 1.4786, 'grad_norm': 39.531078618699645, 'learning_rate': 1e-05, 'epoch': 0.03}\n",
      "{'loss': 0.9955, 'grad_norm': 16.66467873479667, 'learning_rate': 1.5e-05, 'epoch': 0.04}\n",
      "{'loss': 0.7026, 'grad_norm': 7.417151045965821, 'learning_rate': 2e-05, 'epoch': 0.05}\n",
      "{'loss': 0.6713, 'grad_norm': 7.608669365784156, 'learning_rate': 2.5e-05, 'epoch': 0.06}\n",
      "{'loss': 0.5867, 'grad_norm': 12.552373192106195, 'learning_rate': 3e-05, 'epoch': 0.08}\n",
      "{'loss': 0.6067, 'grad_norm': 10.342863016044076, 'learning_rate': 3.5e-05, 'epoch': 0.09}\n",
      "{'loss': 0.5857, 'grad_norm': 10.985433470517048, 'learning_rate': 4e-05, 'epoch': 0.1}\n",
      "{'loss': 0.5306, 'grad_norm': 5.22097493330033, 'learning_rate': 4.5e-05, 'epoch': 0.11}\n",
      "{'loss': 0.5517, 'grad_norm': 3.9679057507396682, 'learning_rate': 5e-05, 'epoch': 0.13}\n",
      "{'loss': 0.4573, 'grad_norm': 4.77643976524929, 'learning_rate': 4.9444444444444446e-05, 'epoch': 0.14}\n",
      "{'loss': 0.469, 'grad_norm': 7.6144285869051345, 'learning_rate': 4.888888888888889e-05, 'epoch': 0.15}\n",
      "{'loss': 0.4748, 'grad_norm': 4.787471338888486, 'learning_rate': 4.8333333333333334e-05, 'epoch': 0.16}\n",
      "{'loss': 0.433, 'grad_norm': 3.3189167275368225, 'learning_rate': 4.7777777777777784e-05, 'epoch': 0.18}\n",
      "{'loss': 0.4282, 'grad_norm': 7.248232922110331, 'learning_rate': 4.722222222222222e-05, 'epoch': 0.19}\n",
      "{'loss': 0.409, 'grad_norm': 6.293684915700438, 'learning_rate': 4.666666666666667e-05, 'epoch': 0.2}\n",
      "{'loss': 0.4451, 'grad_norm': 3.8753855113566833, 'learning_rate': 4.6111111111111115e-05, 'epoch': 0.22}\n",
      "{'loss': 0.4288, 'grad_norm': 3.625475227512274, 'learning_rate': 4.555555555555556e-05, 'epoch': 0.23}\n",
      "{'loss': 0.4506, 'grad_norm': 4.2449874489534665, 'learning_rate': 4.5e-05, 'epoch': 0.24}\n",
      "{'loss': 0.4484, 'grad_norm': 6.084320127673726, 'learning_rate': 4.4444444444444447e-05, 'epoch': 0.25}\n",
      "{'loss': 0.4487, 'grad_norm': 8.363684454316004, 'learning_rate': 4.388888888888889e-05, 'epoch': 0.27}\n",
      "{'loss': 0.4878, 'grad_norm': 3.747181659840593, 'learning_rate': 4.3333333333333334e-05, 'epoch': 0.28}\n",
      "{'loss': 0.412, 'grad_norm': 8.645140642353612, 'learning_rate': 4.277777777777778e-05, 'epoch': 0.29}\n",
      "{'loss': 0.4558, 'grad_norm': 4.5260457637696625, 'learning_rate': 4.222222222222222e-05, 'epoch': 0.3}\n",
      "{'loss': 0.4108, 'grad_norm': 4.781991938451388, 'learning_rate': 4.166666666666667e-05, 'epoch': 0.32}\n",
      "{'loss': 0.4407, 'grad_norm': 5.893275628361186, 'learning_rate': 4.111111111111111e-05, 'epoch': 0.33}\n",
      "{'loss': 0.4475, 'grad_norm': 4.100649312404707, 'learning_rate': 4.055555555555556e-05, 'epoch': 0.34}\n",
      "{'loss': 0.4041, 'grad_norm': 7.0290388233232255, 'learning_rate': 4e-05, 'epoch': 0.35}\n",
      "{'loss': 0.3599, 'grad_norm': 3.511374655086493, 'learning_rate': 3.944444444444445e-05, 'epoch': 0.37}\n",
      "{'loss': 0.4706, 'grad_norm': 5.813953833114259, 'learning_rate': 3.888888888888889e-05, 'epoch': 0.38}\n",
      "{'loss': 0.3911, 'grad_norm': 4.0524183329331604, 'learning_rate': 3.8333333333333334e-05, 'epoch': 0.39}\n",
      "{'loss': 0.4033, 'grad_norm': 3.875046268309963, 'learning_rate': 3.777777777777778e-05, 'epoch': 0.41}\n",
      "{'loss': 0.4199, 'grad_norm': 5.059711960144461, 'learning_rate': 3.722222222222222e-05, 'epoch': 0.42}\n",
      "{'loss': 0.4216, 'grad_norm': 3.959248018825387, 'learning_rate': 3.6666666666666666e-05, 'epoch': 0.43}\n",
      "{'loss': 0.367, 'grad_norm': 4.493383842056094, 'learning_rate': 3.611111111111111e-05, 'epoch': 0.44}\n",
      "{'loss': 0.3686, 'grad_norm': 6.826580929267439, 'learning_rate': 3.555555555555556e-05, 'epoch': 0.46}\n",
      "{'loss': 0.3566, 'grad_norm': 6.61801729550354, 'learning_rate': 3.5e-05, 'epoch': 0.47}\n",
      "{'loss': 0.3932, 'grad_norm': 4.124116051492338, 'learning_rate': 3.444444444444445e-05, 'epoch': 0.48}\n",
      "{'loss': 0.3514, 'grad_norm': 4.545406773056064, 'learning_rate': 3.388888888888889e-05, 'epoch': 0.49}\n",
      "{'loss': 0.4364, 'grad_norm': 5.868492580695467, 'learning_rate': 3.3333333333333335e-05, 'epoch': 0.51}\n",
      "{'loss': 0.346, 'grad_norm': 5.245615445258653, 'learning_rate': 3.277777777777778e-05, 'epoch': 0.52}\n",
      "{'loss': 0.335, 'grad_norm': 3.6031965739940257, 'learning_rate': 3.222222222222223e-05, 'epoch': 0.53}\n",
      "{'loss': 0.37, 'grad_norm': 5.240535743057915, 'learning_rate': 3.1666666666666666e-05, 'epoch': 0.54}\n",
      "{'loss': 0.3732, 'grad_norm': 7.290964612314844, 'learning_rate': 3.111111111111111e-05, 'epoch': 0.56}\n",
      "{'loss': 0.378, 'grad_norm': 5.352972449129333, 'learning_rate': 3.055555555555556e-05, 'epoch': 0.57}\n",
      "{'loss': 0.3512, 'grad_norm': 3.2834858860521705, 'learning_rate': 3e-05, 'epoch': 0.58}\n",
      "{'loss': 0.3963, 'grad_norm': 5.047726585891225, 'learning_rate': 2.9444444444444448e-05, 'epoch': 0.6}\n",
      "{'loss': 0.3825, 'grad_norm': 3.6864211233732562, 'learning_rate': 2.8888888888888888e-05, 'epoch': 0.61}\n",
      "{'loss': 0.3715, 'grad_norm': 4.97593217867295, 'learning_rate': 2.8333333333333335e-05, 'epoch': 0.62}\n",
      "{'loss': 0.4358, 'grad_norm': 5.702141663942072, 'learning_rate': 2.777777777777778e-05, 'epoch': 0.63}\n",
      " 50%|████████████████████                    | 500/1000 [10:03<09:58,  1.20s/it]***** Running Evaluation *****\n",
      "  Num examples = 3000\n",
      "  Batch size = 128\n",
      "\n",
      "  0%|                                                    | 0/24 [00:00<?, ?it/s]\u001b[A\n",
      "  8%|███▋                                        | 2/24 [00:01<00:19,  1.11it/s]\u001b[A\n",
      " 12%|█████▌                                      | 3/24 [00:02<00:18,  1.11it/s]\u001b[A\n",
      " 17%|███████▎                                    | 4/24 [00:03<00:17,  1.11it/s]\u001b[A\n",
      " 21%|█████████▏                                  | 5/24 [00:04<00:17,  1.11it/s]\u001b[A\n",
      " 25%|███████████                                 | 6/24 [00:05<00:16,  1.11it/s]\u001b[A\n",
      " 29%|████████████▊                               | 7/24 [00:06<00:15,  1.11it/s]\u001b[A\n",
      " 33%|██████████████▋                             | 8/24 [00:07<00:14,  1.11it/s]\u001b[A\n",
      " 38%|████████████████▌                           | 9/24 [00:08<00:13,  1.11it/s]\u001b[A\n",
      " 42%|█████████████████▉                         | 10/24 [00:08<00:12,  1.11it/s]\u001b[A\n",
      " 46%|███████████████████▋                       | 11/24 [00:09<00:11,  1.11it/s]\u001b[A\n",
      " 50%|█████████████████████▌                     | 12/24 [00:10<00:10,  1.11it/s]\u001b[A\n",
      " 54%|███████████████████████▎                   | 13/24 [00:11<00:09,  1.11it/s]\u001b[A\n",
      " 58%|█████████████████████████                  | 14/24 [00:12<00:08,  1.11it/s]\u001b[A\n",
      " 62%|██████████████████████████▉                | 15/24 [00:13<00:08,  1.11it/s]\u001b[A\n",
      " 67%|████████████████████████████▋              | 16/24 [00:14<00:07,  1.11it/s]\u001b[A\n",
      " 71%|██████████████████████████████▍            | 17/24 [00:15<00:06,  1.11it/s]\u001b[A\n",
      " 75%|████████████████████████████████▎          | 18/24 [00:16<00:05,  1.11it/s]\u001b[A\n",
      " 79%|██████████████████████████████████         | 19/24 [00:17<00:04,  1.11it/s]\u001b[A\n",
      " 83%|███████████████████████████████████▊       | 20/24 [00:17<00:03,  1.11it/s]\u001b[A\n",
      " 88%|█████████████████████████████████████▋     | 21/24 [00:18<00:02,  1.11it/s]\u001b[A\n",
      " 92%|███████████████████████████████████████▍   | 22/24 [00:19<00:01,  1.11it/s]\u001b[A\n",
      " 96%|█████████████████████████████████████████▏ | 23/24 [00:20<00:00,  1.12it/s]\u001b[A\n",
      "                                                                                \u001b[A\n",
      "\u001b[A{'eval_loss': 0.4814399480819702, 'eval_runtime': 23.5015, 'eval_samples_per_second': 127.651, 'eval_steps_per_second': 1.021, 'epoch': 0.63}\n",
      " 50%|████████████████████                    | 500/1000 [10:26<09:58,  1.20s/it]\n",
      "100%|███████████████████████████████████████████| 24/24 [00:21<00:00,  1.22it/s]\u001b[A\n",
      "                                                                                \u001b[ASaving model checkpoint to output/ocnli_public_chatml/20240315212836/tmp-checkpoint-500\n",
      "tokenizer config file saved in output/ocnli_public_chatml/20240315212836/tmp-checkpoint-500/tokenizer_config.json\n",
      "Special tokens file saved in output/ocnli_public_chatml/20240315212836/tmp-checkpoint-500/special_tokens_map.json\n",
      "[2024-03-15 21:39:48,407] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step500 is about to be saved!\n",
      "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1876: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n",
      "  warnings.warn(\n",
      "[2024-03-15 21:39:48,447] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: output/ocnli_public_chatml/20240315212836/tmp-checkpoint-500/global_step500/zero_pp_rank_0_mp_rank_00_model_states.pt\n",
      "[2024-03-15 21:39:48,447] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving output/ocnli_public_chatml/20240315212836/tmp-checkpoint-500/global_step500/zero_pp_rank_0_mp_rank_00_model_states.pt...\n",
      "[2024-03-15 21:39:48,455] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved output/ocnli_public_chatml/20240315212836/tmp-checkpoint-500/global_step500/zero_pp_rank_0_mp_rank_00_model_states.pt.\n",
      "[2024-03-15 21:39:48,455] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving output/ocnli_public_chatml/20240315212836/tmp-checkpoint-500/global_step500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...\n",
      "[2024-03-15 21:39:48,493] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved output/ocnli_public_chatml/20240315212836/tmp-checkpoint-500/global_step500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.\n",
      "[2024-03-15 21:39:48,493] [INFO] [engine.py:3488:_save_zero_checkpoint] zero checkpoint saved output/ocnli_public_chatml/20240315212836/tmp-checkpoint-500/global_step500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt\n",
      "[2024-03-15 21:39:48,498] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step500 is ready now!\n",
      "[2024-03-15 21:39:49,718] [WARNING] [stage3.py:2069:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time\n",
      "{'loss': 0.4598, 'grad_norm': 6.5312558406821974, 'learning_rate': 2.7222222222222223e-05, 'epoch': 0.65}\n",
      "{'loss': 0.355, 'grad_norm': 3.9302654106847914, 'learning_rate': 2.6666666666666667e-05, 'epoch': 0.66}\n",
      "{'loss': 0.3781, 'grad_norm': 4.25997203692361, 'learning_rate': 2.6111111111111114e-05, 'epoch': 0.67}\n",
      "{'loss': 0.3668, 'grad_norm': 3.5989513406349776, 'learning_rate': 2.5555555555555554e-05, 'epoch': 0.68}\n",
      "{'loss': 0.3585, 'grad_norm': 3.6575850959103717, 'learning_rate': 2.5e-05, 'epoch': 0.7}\n",
      "{'loss': 0.3674, 'grad_norm': 4.911812708486751, 'learning_rate': 2.4444444444444445e-05, 'epoch': 0.71}\n",
      "{'loss': 0.368, 'grad_norm': 4.194735979358348, 'learning_rate': 2.3888888888888892e-05, 'epoch': 0.72}\n",
      "{'loss': 0.3891, 'grad_norm': 3.5460606114800868, 'learning_rate': 2.3333333333333336e-05, 'epoch': 0.74}\n",
      "{'loss': 0.3977, 'grad_norm': 3.150838310468473, 'learning_rate': 2.277777777777778e-05, 'epoch': 0.75}\n",
      "{'loss': 0.3533, 'grad_norm': 3.9069432978502756, 'learning_rate': 2.2222222222222223e-05, 'epoch': 0.76}\n",
      "{'loss': 0.3811, 'grad_norm': 5.105086367004499, 'learning_rate': 2.1666666666666667e-05, 'epoch': 0.77}\n",
      "{'loss': 0.325, 'grad_norm': 4.369369589510735, 'learning_rate': 2.111111111111111e-05, 'epoch': 0.79}\n",
      "{'loss': 0.3641, 'grad_norm': 6.171511559710524, 'learning_rate': 2.0555555555555555e-05, 'epoch': 0.8}\n",
      "{'loss': 0.3316, 'grad_norm': 3.7044215769355313, 'learning_rate': 2e-05, 'epoch': 0.81}\n",
      "{'loss': 0.3898, 'grad_norm': 3.788686076864363, 'learning_rate': 1.9444444444444445e-05, 'epoch': 0.82}\n",
      "{'loss': 0.3732, 'grad_norm': 6.75853923792821, 'learning_rate': 1.888888888888889e-05, 'epoch': 0.84}\n",
      "{'loss': 0.3827, 'grad_norm': 5.165864430975117, 'learning_rate': 1.8333333333333333e-05, 'epoch': 0.85}\n",
      "{'loss': 0.3565, 'grad_norm': 3.535604172460323, 'learning_rate': 1.777777777777778e-05, 'epoch': 0.86}\n",
      "{'loss': 0.3345, 'grad_norm': 3.633280931030727, 'learning_rate': 1.7222222222222224e-05, 'epoch': 0.87}\n",
      "{'loss': 0.3639, 'grad_norm': 4.485584268777012, 'learning_rate': 1.6666666666666667e-05, 'epoch': 0.89}\n",
      "{'loss': 0.402, 'grad_norm': 3.7925929660253317, 'learning_rate': 1.6111111111111115e-05, 'epoch': 0.9}\n",
      "{'loss': 0.3452, 'grad_norm': 5.183220810399684, 'learning_rate': 1.5555555555555555e-05, 'epoch': 0.91}\n",
      "{'loss': 0.3936, 'grad_norm': 9.733180087550997, 'learning_rate': 1.5e-05, 'epoch': 0.93}\n",
      "{'loss': 0.3367, 'grad_norm': 5.1834921923924755, 'learning_rate': 1.4444444444444444e-05, 'epoch': 0.94}\n",
      "{'loss': 0.3681, 'grad_norm': 4.409917292781669, 'learning_rate': 1.388888888888889e-05, 'epoch': 0.95}\n",
      "{'loss': 0.3348, 'grad_norm': 3.335369553115092, 'learning_rate': 1.3333333333333333e-05, 'epoch': 0.96}\n",
      "{'loss': 0.3972, 'grad_norm': 5.322684365694768, 'learning_rate': 1.2777777777777777e-05, 'epoch': 0.98}\n",
      "{'loss': 0.3835, 'grad_norm': 6.105565593241867, 'learning_rate': 1.2222222222222222e-05, 'epoch': 0.99}\n",
      "{'loss': 0.3916, 'grad_norm': 3.49133044485143, 'learning_rate': 1.1666666666666668e-05, 'epoch': 1.0}\n",
      "{'loss': 0.3597, 'grad_norm': 2.888336925676786, 'learning_rate': 1.1111111111111112e-05, 'epoch': 1.01}\n",
      "{'loss': 0.3304, 'grad_norm': 2.9537925974792714, 'learning_rate': 1.0555555555555555e-05, 'epoch': 1.03}\n",
      "{'loss': 0.3392, 'grad_norm': 5.712451906231322, 'learning_rate': 1e-05, 'epoch': 1.04}\n",
      "{'loss': 0.3393, 'grad_norm': 5.12273971212701, 'learning_rate': 9.444444444444445e-06, 'epoch': 1.05}\n",
      "{'loss': 0.3018, 'grad_norm': 3.2845513584107033, 'learning_rate': 8.88888888888889e-06, 'epoch': 1.06}\n",
      "{'loss': 0.3384, 'grad_norm': 3.2604963558968145, 'learning_rate': 8.333333333333334e-06, 'epoch': 1.08}\n",
      "{'loss': 0.3252, 'grad_norm': 6.04878965518926, 'learning_rate': 7.777777777777777e-06, 'epoch': 1.09}\n",
      "{'loss': 0.384, 'grad_norm': 5.226938733071884, 'learning_rate': 7.222222222222222e-06, 'epoch': 1.1}\n",
      "{'loss': 0.2914, 'grad_norm': 3.8905566106093925, 'learning_rate': 6.666666666666667e-06, 'epoch': 1.12}\n",
      "{'loss': 0.2984, 'grad_norm': 3.3599598929872525, 'learning_rate': 6.111111111111111e-06, 'epoch': 1.13}\n",
      "{'loss': 0.3459, 'grad_norm': 5.669365782344921, 'learning_rate': 5.555555555555556e-06, 'epoch': 1.14}\n",
      "{'loss': 0.3393, 'grad_norm': 3.078993311756746, 'learning_rate': 5e-06, 'epoch': 1.15}\n",
      "{'loss': 0.3314, 'grad_norm': 5.3827552737002495, 'learning_rate': 4.444444444444445e-06, 'epoch': 1.17}\n",
      "{'loss': 0.3345, 'grad_norm': 3.2322873367016665, 'learning_rate': 3.888888888888889e-06, 'epoch': 1.18}\n",
      "{'loss': 0.3363, 'grad_norm': 3.3300669560846425, 'learning_rate': 3.3333333333333333e-06, 'epoch': 1.19}\n",
      "{'loss': 0.344, 'grad_norm': 3.7589742724407653, 'learning_rate': 2.777777777777778e-06, 'epoch': 1.2}\n",
      "{'loss': 0.3195, 'grad_norm': 2.8061902793867626, 'learning_rate': 2.2222222222222225e-06, 'epoch': 1.22}\n",
      "{'loss': 0.3128, 'grad_norm': 3.3215568095822516, 'learning_rate': 1.6666666666666667e-06, 'epoch': 1.23}\n",
      "{'loss': 0.3035, 'grad_norm': 4.30331459929754, 'learning_rate': 1.1111111111111112e-06, 'epoch': 1.24}\n",
      "{'loss': 0.3374, 'grad_norm': 3.9324447635716995, 'learning_rate': 5.555555555555556e-07, 'epoch': 1.25}\n",
      "{'loss': 0.3254, 'grad_norm': 4.112509804571923, 'learning_rate': 0.0, 'epoch': 1.27}\n",
      "100%|███████████████████████████████████████| 1000/1000 [20:30<00:00,  1.19s/it]***** Running Evaluation *****\n",
      "  Num examples = 3000\n",
      "  Batch size = 128\n",
      "\n",
      "  0%|                                                    | 0/24 [00:00<?, ?it/s]\u001b[A\n",
      "  8%|███▋                                        | 2/24 [00:00<00:09,  2.23it/s]\u001b[A\n",
      " 12%|█████▌                                      | 3/24 [00:01<00:13,  1.58it/s]\u001b[A\n",
      " 17%|███████▎                                    | 4/24 [00:02<00:14,  1.37it/s]\u001b[A\n",
      " 21%|█████████▏                                  | 5/24 [00:03<00:14,  1.27it/s]\u001b[A\n",
      " 25%|███████████                                 | 6/24 [00:04<00:14,  1.21it/s]\u001b[A\n",
      " 29%|████████████▊                               | 7/24 [00:05<00:14,  1.18it/s]\u001b[A\n",
      " 33%|██████████████▋                             | 8/24 [00:06<00:13,  1.16it/s]\u001b[A\n",
      " 38%|████████████████▌                           | 9/24 [00:07<00:13,  1.14it/s]\u001b[A\n",
      " 42%|█████████████████▉                         | 10/24 [00:08<00:12,  1.13it/s]\u001b[A\n",
      " 46%|███████████████████▋                       | 11/24 [00:08<00:11,  1.13it/s]\u001b[A\n",
      " 50%|█████████████████████▌                     | 12/24 [00:09<00:10,  1.12it/s]\u001b[A\n",
      " 54%|███████████████████████▎                   | 13/24 [00:10<00:09,  1.12it/s]\u001b[A\n",
      " 58%|█████████████████████████                  | 14/24 [00:11<00:08,  1.12it/s]\u001b[A\n",
      " 62%|██████████████████████████▉                | 15/24 [00:12<00:08,  1.12it/s]\u001b[A\n",
      " 67%|████████████████████████████▋              | 16/24 [00:13<00:07,  1.12it/s]\u001b[A\n",
      " 71%|██████████████████████████████▍            | 17/24 [00:14<00:06,  1.11it/s]\u001b[A\n",
      " 75%|████████████████████████████████▎          | 18/24 [00:15<00:05,  1.12it/s]\u001b[A\n",
      " 79%|██████████████████████████████████         | 19/24 [00:16<00:04,  1.12it/s]\u001b[A\n",
      " 83%|███████████████████████████████████▊       | 20/24 [00:17<00:03,  1.12it/s]\u001b[A\n",
      " 88%|█████████████████████████████████████▋     | 21/24 [00:17<00:02,  1.12it/s]\u001b[A\n",
      " 92%|███████████████████████████████████████▍   | 22/24 [00:18<00:01,  1.12it/s]\u001b[A\n",
      " 96%|█████████████████████████████████████████▏ | 23/24 [00:19<00:00,  1.12it/s]\u001b[A\n",
      "                                                                                \u001b[A\n",
      "\u001b[A{'eval_loss': 0.414621502161026, 'eval_runtime': 21.2011, 'eval_samples_per_second': 141.502, 'eval_steps_per_second': 1.132, 'epoch': 1.27}\n",
      "100%|███████████████████████████████████████| 1000/1000 [20:52<00:00,  1.19s/it]\n",
      "100%|███████████████████████████████████████████| 24/24 [00:20<00:00,  1.22it/s]\u001b[A\n",
      "                                                                                \u001b[ASaving model checkpoint to output/ocnli_public_chatml/20240315212836/tmp-checkpoint-1000\n",
      "tokenizer config file saved in output/ocnli_public_chatml/20240315212836/tmp-checkpoint-1000/tokenizer_config.json\n",
      "Special tokens file saved in output/ocnli_public_chatml/20240315212836/tmp-checkpoint-1000/special_tokens_map.json\n",
      "[2024-03-15 21:50:12,793] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step1000 is about to be saved!\n",
      "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1876: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n",
      "  warnings.warn(\n",
      "[2024-03-15 21:50:12,809] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: output/ocnli_public_chatml/20240315212836/tmp-checkpoint-1000/global_step1000/zero_pp_rank_0_mp_rank_00_model_states.pt\n",
      "[2024-03-15 21:50:12,809] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving output/ocnli_public_chatml/20240315212836/tmp-checkpoint-1000/global_step1000/zero_pp_rank_0_mp_rank_00_model_states.pt...\n",
      "[2024-03-15 21:50:12,817] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved output/ocnli_public_chatml/20240315212836/tmp-checkpoint-1000/global_step1000/zero_pp_rank_0_mp_rank_00_model_states.pt.\n",
      "[2024-03-15 21:50:12,818] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving output/ocnli_public_chatml/20240315212836/tmp-checkpoint-1000/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...\n",
      "[2024-03-15 21:50:12,851] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved output/ocnli_public_chatml/20240315212836/tmp-checkpoint-1000/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.\n",
      "[2024-03-15 21:50:12,852] [INFO] [engine.py:3488:_save_zero_checkpoint] zero checkpoint saved output/ocnli_public_chatml/20240315212836/tmp-checkpoint-1000/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt\n",
      "[2024-03-15 21:50:12,856] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step1000 is ready now!\n",
      "\n",
      "\n",
      "Training completed. Do not forget to share your model on huggingface.co/models =)\n",
      "\n",
      "\n",
      "{'train_runtime': 1255.2202, 'train_samples_per_second': 50.987, 'train_steps_per_second': 0.797, 'train_loss': 0.43027476024627687, 'epoch': 1.27}\n",
      "100%|███████████████████████████████████████| 1000/1000 [20:55<00:00,  1.26s/it]\n",
      "[2024-03-15 21:50:18,203] [INFO] [launch.py:348:main] Process 86577 exits successfully.\n"
     ]
    }
   ],
   "source": [
    "!bash lora_finetune_ocnli.sh"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. 推理验证"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import torch\n",
    "from tqdm import tqdm\n",
    "from transformers import AutoModelForCausalLM, AutoTokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "path = \"output/ocnli_public_chatml/20240316002856/checkpoint-1500\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(path)\n",
    "model = AutoModelForCausalLM.from_pretrained(\n",
    "    path, torch_dtype=torch.bfloat16, device_map=\"cuda\", trust_remote_code=True\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "('entailment',\n",
       " [{'role': 'user',\n",
       "   'content': '<用户>请判断下边两个句子的关系属于 [entailment, neutral, contradiction]中的哪一种？\\n句子1: 身上裹一件工厂发的棉大衣,手插在袖筒里\\n句子2：身上至少一件衣服\\n<AI>'},\n",
       "  {'role': 'assistant', 'content': 'entailment'}])"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "res, history = model.chat(tokenizer, query=\"<用户>请判断下边两个句子的关系属于 [entailment, neutral, contradiction]中的哪一种？\\n句子1: 身上裹一件工厂发的棉大衣,手插在袖筒里\\n句子2：身上至少一件衣服\\n<AI>\")\n",
    "res, history"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"data/ocnli_public_chatml/dev.json\", 'r') as f:\n",
    "    dev_sample_list = json.load(f)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|          | 0/500 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "  0%|          | 1/500 [00:00<00:54,  9.12it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "  0%|          | 2/500 [00:00<00:54,  9.09it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "  1%|          | 3/500 [00:00<00:55,  8.98it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "  1%|          | 5/500 [00:00<00:49,  9.99it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "  1%|          | 6/500 [00:00<00:51,  9.67it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "  2%|▏         | 8/500 [00:00<00:44, 11.07it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "  2%|▏         | 10/500 [00:00<00:48, 10.17it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "  2%|▏         | 12/500 [00:01<00:47, 10.30it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "  3%|▎         | 14/500 [00:01<00:46, 10.51it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "  3%|▎         | 16/500 [00:01<00:42, 11.33it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "  4%|▎         | 18/500 [00:01<00:40, 11.96it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "  4%|▍         | 20/500 [00:01<00:38, 12.42it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "  4%|▍         | 22/500 [00:02<00:42, 11.19it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "  5%|▍         | 24/500 [00:02<00:40, 11.83it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "  5%|▌         | 26/500 [00:02<00:38, 12.32it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "  6%|▌         | 28/500 [00:02<00:39, 11.87it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "  6%|▌         | 30/500 [00:02<00:40, 11.55it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "  6%|▋         | 32/500 [00:02<00:41, 11.34it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "  7%|▋         | 34/500 [00:03<00:39, 11.94it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "  7%|▋         | 36/500 [00:03<00:37, 12.38it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "  8%|▊         | 38/500 [00:03<00:38, 11.92it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "  8%|▊         | 40/500 [00:03<00:37, 12.37it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "  8%|▊         | 42/500 [00:03<00:36, 12.71it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "  9%|▉         | 44/500 [00:03<00:40, 11.39it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "  9%|▉         | 46/500 [00:04<00:37, 11.98it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 10%|▉         | 48/500 [00:04<00:38, 11.66it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 10%|█         | 50/500 [00:04<00:36, 12.19it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 10%|█         | 52/500 [00:04<00:35, 12.58it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 11%|█         | 54/500 [00:04<00:34, 12.87it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 11%|█         | 56/500 [00:04<00:34, 13.06it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 12%|█▏        | 58/500 [00:05<00:38, 11.58it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 12%|█▏        | 60/500 [00:05<00:38, 11.39it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 12%|█▏        | 62/500 [00:05<00:36, 11.97it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 13%|█▎        | 64/500 [00:05<00:35, 12.36it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 13%|█▎        | 66/500 [00:05<00:36, 11.89it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 14%|█▎        | 68/500 [00:05<00:37, 11.60it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 14%|█▍        | 70/500 [00:06<00:39, 10.75it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 14%|█▍        | 72/500 [00:06<00:41, 10.23it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 15%|█▍        | 74/500 [00:06<00:40, 10.44it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 15%|█▌        | 76/500 [00:06<00:38, 11.10it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 16%|█▌        | 78/500 [00:06<00:36, 11.67it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 16%|█▌        | 80/500 [00:06<00:37, 11.35it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 16%|█▋        | 82/500 [00:07<00:37, 11.23it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 17%|█▋        | 84/500 [00:07<00:37, 11.15it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 17%|█▋        | 86/500 [00:07<00:37, 11.10it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 18%|█▊        | 88/500 [00:07<00:37, 11.06it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 18%|█▊        | 90/500 [00:07<00:34, 11.72it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 18%|█▊        | 92/500 [00:08<00:35, 11.50it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 19%|█▉        | 94/500 [00:08<00:33, 12.07it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 19%|█▉        | 96/500 [00:08<00:34, 11.73it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 20%|█▉        | 98/500 [00:08<00:32, 12.24it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 20%|██        | 100/500 [00:08<00:33, 11.84it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 20%|██        | 102/500 [00:08<00:34, 11.57it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 21%|██        | 104/500 [00:08<00:32, 12.11it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 21%|██        | 106/500 [00:09<00:31, 12.53it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 22%|██▏       | 108/500 [00:09<00:30, 12.85it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 22%|██▏       | 110/500 [00:09<00:29, 13.07it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 22%|██▏       | 112/500 [00:09<00:29, 13.24it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 23%|██▎       | 114/500 [00:09<00:28, 13.35it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 23%|██▎       | 116/500 [00:09<00:28, 13.43it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 24%|██▎       | 118/500 [00:10<00:31, 12.23it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 24%|██▍       | 120/500 [00:10<00:33, 11.29it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 24%|██▍       | 122/500 [00:10<00:33, 11.19it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 25%|██▍       | 124/500 [00:10<00:31, 11.81it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 25%|██▌       | 126/500 [00:10<00:30, 12.25it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 26%|██▌       | 128/500 [00:10<00:31, 11.77it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 26%|██▌       | 130/500 [00:11<00:30, 12.25it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 26%|██▋       | 132/500 [00:11<00:29, 12.63it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 27%|██▋       | 134/500 [00:11<00:32, 11.36it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 27%|██▋       | 136/500 [00:11<00:34, 10.62it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 28%|██▊       | 138/500 [00:11<00:33, 10.73it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 28%|██▊       | 140/500 [00:12<00:33, 10.80it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 28%|██▊       | 142/500 [00:12<00:31, 11.52it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 29%|██▉       | 144/500 [00:12<00:31, 11.36it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 29%|██▉       | 146/500 [00:12<00:29, 11.96it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 30%|██▉       | 148/500 [00:12<00:30, 11.65it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 30%|███       | 150/500 [00:12<00:28, 12.20it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 30%|███       | 152/500 [00:12<00:27, 12.60it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 31%|███       | 154/500 [00:13<00:28, 12.07it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 31%|███       | 156/500 [00:13<00:27, 12.49it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 32%|███▏      | 158/500 [00:13<00:26, 12.82it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 32%|███▏      | 160/500 [00:13<00:27, 12.21it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 32%|███▏      | 162/500 [00:13<00:30, 11.11it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 33%|███▎      | 164/500 [00:14<00:30, 11.07it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 33%|███▎      | 166/500 [00:14<00:28, 11.73it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 34%|███▎      | 168/500 [00:14<00:27, 12.24it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 34%|███▍      | 170/500 [00:14<00:29, 11.14it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 34%|███▍      | 172/500 [00:14<00:29, 11.06it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 35%|███▍      | 174/500 [00:14<00:27, 11.67it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 35%|███▌      | 176/500 [00:15<00:26, 12.17it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 36%|███▌      | 178/500 [00:15<00:25, 12.55it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 36%|███▌      | 180/500 [00:15<00:24, 12.82it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 36%|███▋      | 182/500 [00:15<00:26, 12.20it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 37%|███▋      | 184/500 [00:15<00:26, 11.79it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 37%|███▋      | 186/500 [00:15<00:25, 12.26it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 38%|███▊      | 188/500 [00:16<00:26, 11.83it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 38%|███▊      | 190/500 [00:16<00:28, 10.88it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 38%|███▊      | 192/500 [00:16<00:26, 11.57it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 39%|███▉      | 194/500 [00:16<00:26, 11.38it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 39%|███▉      | 196/500 [00:16<00:27, 11.17it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 40%|███▉      | 198/500 [00:16<00:25, 11.74it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 40%|████      | 200/500 [00:17<00:24, 12.21it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 40%|████      | 202/500 [00:17<00:23, 12.59it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 41%|████      | 204/500 [00:17<00:24, 12.04it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 41%|████      | 206/500 [00:17<00:25, 11.69it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 42%|████▏     | 208/500 [00:17<00:25, 11.45it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 42%|████▏     | 210/500 [00:17<00:25, 11.29it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 42%|████▏     | 212/500 [00:18<00:24, 11.86it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 43%|████▎     | 214/500 [00:18<00:24, 11.52it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 43%|████▎     | 216/500 [00:18<00:23, 12.06it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 44%|████▎     | 218/500 [00:18<00:25, 11.02it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 44%|████▍     | 220/500 [00:18<00:23, 11.68it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 44%|████▍     | 222/500 [00:18<00:24, 11.45it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 45%|████▍     | 224/500 [00:19<00:25, 10.64it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 45%|████▌     | 226/500 [00:19<00:25, 10.73it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 46%|████▌     | 228/500 [00:19<00:25, 10.79it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 46%|████▌     | 230/500 [00:19<00:24, 10.83it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 46%|████▋     | 232/500 [00:19<00:24, 10.86it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 47%|████▋     | 234/500 [00:20<00:26, 10.16it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 47%|████▋     | 236/500 [00:20<00:25, 10.33it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 48%|████▊     | 238/500 [00:20<00:23, 11.11it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 48%|████▊     | 240/500 [00:20<00:22, 11.74it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 48%|████▊     | 242/500 [00:20<00:23, 10.82it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 49%|████▉     | 244/500 [00:21<00:22, 11.53it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 49%|████▉     | 246/500 [00:21<00:23, 10.71it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 50%|████▉     | 248/500 [00:21<00:24, 10.22it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 50%|█████     | 250/500 [00:21<00:22, 11.06it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 50%|█████     | 252/500 [00:21<00:21, 11.74it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 51%|█████     | 254/500 [00:21<00:20, 12.27it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 51%|█████     | 256/500 [00:22<00:20, 11.81it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 52%|█████▏    | 258/500 [00:22<00:19, 12.23it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 52%|█████▏    | 260/500 [00:22<00:19, 12.55it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 52%|█████▏    | 262/500 [00:22<00:18, 12.82it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 53%|█████▎    | 264/500 [00:22<00:19, 12.17it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 53%|█████▎    | 266/500 [00:22<00:18, 12.55it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 54%|█████▎    | 268/500 [00:22<00:18, 12.83it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 54%|█████▍    | 270/500 [00:23<00:17, 13.04it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 54%|█████▍    | 272/500 [00:23<00:17, 13.19it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 55%|█████▍    | 274/500 [00:23<00:17, 13.29it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 55%|█████▌    | 276/500 [00:23<00:17, 12.48it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 56%|█████▌    | 278/500 [00:23<00:17, 12.77it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 56%|█████▌    | 280/500 [00:23<00:17, 12.94it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 56%|█████▋    | 282/500 [00:24<00:17, 12.11it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 57%|█████▋    | 284/500 [00:24<00:17, 12.51it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 57%|█████▋    | 286/500 [00:24<00:18, 11.29it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 58%|█████▊    | 288/500 [00:24<00:18, 11.17it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 58%|█████▊    | 290/500 [00:24<00:20, 10.43it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 58%|█████▊    | 292/500 [00:25<00:19, 10.57it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 59%|█████▉    | 294/500 [00:25<00:19, 10.68it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 59%|█████▉    | 296/500 [00:25<00:17, 11.39it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 60%|█████▉    | 298/500 [00:25<00:17, 11.25it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 60%|██████    | 300/500 [00:25<00:17, 11.15it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 60%|██████    | 302/500 [00:25<00:17, 11.09it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 61%|██████    | 304/500 [00:26<00:16, 11.72it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 61%|██████    | 306/500 [00:26<00:16, 11.47it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 62%|██████▏   | 308/500 [00:26<00:16, 11.31it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 62%|██████▏   | 310/500 [00:26<00:16, 11.85it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 62%|██████▏   | 312/500 [00:26<00:16, 11.47it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 63%|██████▎   | 314/500 [00:26<00:16, 11.25it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 63%|██████▎   | 316/500 [00:27<00:16, 11.14it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 64%|██████▎   | 318/500 [00:27<00:16, 11.06it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 64%|██████▍   | 320/500 [00:27<00:15, 11.68it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 64%|██████▍   | 322/500 [00:27<00:14, 12.19it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 65%|██████▍   | 324/500 [00:27<00:13, 12.58it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 65%|██████▌   | 326/500 [00:27<00:14, 12.04it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 66%|██████▌   | 328/500 [00:28<00:13, 12.46it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 66%|██████▌   | 330/500 [00:28<00:14, 11.96it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 66%|██████▋   | 332/500 [00:28<00:13, 12.33it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 67%|██████▋   | 334/500 [00:28<00:14, 11.84it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 67%|██████▋   | 336/500 [00:28<00:13, 12.30it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 68%|██████▊   | 338/500 [00:28<00:13, 11.78it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 68%|██████▊   | 340/500 [00:29<00:13, 12.24it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 68%|██████▊   | 342/500 [00:29<00:12, 12.60it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 69%|██████▉   | 344/500 [00:29<00:12, 12.03it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 69%|██████▉   | 346/500 [00:29<00:13, 11.68it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 70%|██████▉   | 348/500 [00:29<00:14, 10.77it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 70%|███████   | 350/500 [00:30<00:14, 10.23it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 70%|███████   | 352/500 [00:30<00:14, 10.43it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 71%|███████   | 354/500 [00:30<00:13, 10.55it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 71%|███████   | 356/500 [00:30<00:12, 11.29it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 72%|███████▏  | 358/500 [00:30<00:13, 10.20it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 72%|███████▏  | 360/500 [00:31<00:13, 10.38it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 72%|███████▏  | 362/500 [00:31<00:12, 10.77it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 73%|███████▎  | 364/500 [00:31<00:12, 11.03it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 73%|███████▎  | 366/500 [00:31<00:12, 10.81it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 74%|███████▎  | 368/500 [00:31<00:13,  9.74it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 74%|███████▍  | 370/500 [00:31<00:12, 10.21it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 74%|███████▍  | 372/500 [00:32<00:12,  9.95it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 75%|███████▍  | 374/500 [00:32<00:12, 10.08it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 75%|███████▌  | 376/500 [00:32<00:11, 10.46it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 76%|███████▌  | 378/500 [00:32<00:11, 10.17it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 76%|███████▌  | 380/500 [00:32<00:11, 10.26it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 76%|███████▋  | 382/500 [00:33<00:11, 10.05it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 77%|███████▋  | 384/500 [00:33<00:11, 10.47it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 77%|███████▋  | 386/500 [00:33<00:10, 10.96it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 78%|███████▊  | 388/500 [00:33<00:10, 10.85it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 78%|███████▊  | 390/500 [00:33<00:10, 10.84it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 78%|███████▊  | 392/500 [00:34<00:10, 10.24it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 79%|███████▉  | 394/500 [00:34<00:09, 11.03it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 79%|███████▉  | 396/500 [00:34<00:08, 11.67it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 80%|███████▉  | 398/500 [00:34<00:08, 12.15it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 80%|████████  | 400/500 [00:34<00:09, 11.03it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 80%|████████  | 402/500 [00:34<00:08, 11.54it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 81%|████████  | 404/500 [00:35<00:08, 11.31it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 81%|████████  | 406/500 [00:35<00:07, 11.87it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 82%|████████▏ | 408/500 [00:35<00:07, 12.30it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 82%|████████▏ | 410/500 [00:35<00:07, 11.83it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 82%|████████▏ | 412/500 [00:35<00:08, 10.84it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 83%|████████▎ | 414/500 [00:35<00:07, 11.50it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 83%|████████▎ | 416/500 [00:36<00:06, 12.02it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 84%|████████▎ | 418/500 [00:36<00:07, 11.65it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 84%|████████▍ | 420/500 [00:36<00:06, 12.14it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 84%|████████▍ | 422/500 [00:36<00:06, 11.69it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 85%|████████▍ | 424/500 [00:36<00:06, 12.10it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 85%|████████▌ | 426/500 [00:36<00:06, 11.60it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 86%|████████▌ | 428/500 [00:37<00:05, 12.09it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 86%|████████▌ | 430/500 [00:37<00:05, 12.46it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 86%|████████▋ | 432/500 [00:37<00:06, 11.20it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 87%|████████▋ | 434/500 [00:37<00:05, 11.79it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 87%|████████▋ | 436/500 [00:37<00:05, 12.26it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 88%|████████▊ | 438/500 [00:37<00:04, 12.62it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 88%|████████▊ | 440/500 [00:38<00:04, 12.06it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 88%|████████▊ | 442/500 [00:38<00:04, 12.42it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 89%|████████▉ | 444/500 [00:38<00:04, 12.72it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 89%|████████▉ | 446/500 [00:38<00:04, 12.94it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 90%|████████▉ | 448/500 [00:38<00:03, 13.10it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 90%|█████████ | 450/500 [00:38<00:04, 12.36it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 90%|█████████ | 452/500 [00:39<00:04, 11.89it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 91%|█████████ | 454/500 [00:39<00:03, 12.33it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 91%|█████████ | 456/500 [00:39<00:03, 12.66it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 92%|█████████▏| 458/500 [00:39<00:03, 12.08it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 92%|█████████▏| 460/500 [00:39<00:03, 12.49it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 92%|█████████▏| 462/500 [00:39<00:03, 11.24it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 93%|█████████▎| 464/500 [00:40<00:03, 11.14it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 93%|█████████▎| 466/500 [00:40<00:03, 10.38it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 94%|█████████▎| 468/500 [00:40<00:03, 10.50it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 94%|█████████▍| 470/500 [00:40<00:02, 11.25it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 94%|█████████▍| 472/500 [00:40<00:02, 11.11it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 95%|█████████▍| 474/500 [00:40<00:02, 11.64it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 95%|█████████▌| 476/500 [00:41<00:02, 11.37it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 96%|█████████▌| 478/500 [00:41<00:01, 11.89it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 96%|█████████▌| 480/500 [00:41<00:01, 12.29it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 96%|█████████▋| 482/500 [00:41<00:01, 12.63it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 97%|█████████▋| 484/500 [00:41<00:01, 12.89it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 97%|█████████▋| 486/500 [00:41<00:01, 12.23it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 98%|█████████▊| 488/500 [00:42<00:01, 11.80it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 98%|█████████▊| 490/500 [00:42<00:00, 12.28it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 98%|█████████▊| 492/500 [00:42<00:00, 11.12it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 99%|█████████▉| 494/500 [00:42<00:00, 11.06it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      " 99%|█████████▉| 496/500 [00:42<00:00, 10.40it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "100%|█████████▉| 498/500 [00:43<00:00, 10.55it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "100%|██████████| 500/500 [00:43<00:00, 11.56it/s]\n"
     ]
    }
   ],
   "source": [
    "pos = 0\n",
    "neg = 0\n",
    "for sample in tqdm(dev_sample_list[:500]):\n",
    "    res, history = model.chat(tokenizer, query=\"<用户>{}<AI>\".format(sample[\"messages\"][0][\"content\"]), max_length=128, top_p=0.5, temperature=0.8)\n",
    "    if sample[\"messages\"][1][\"content\"] in res.strip().lower():\n",
    "        pos += 1\n",
    "    else:\n",
    "        neg += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.81, 405, 95)"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pos / (pos+neg), pos, neg"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
